Du Chemin Texts

Author

Richard Freedman

Published

March 25, 2025

Du Chemin Text Network

Code
import pandas as pd
import os
import bs4 
from bs4 import BeautifulSoup
from lxml import etree # lxml is imported for xpath examples

from pathlib import Path
import requests
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import math
import numpy as np
from itertools import chain

# Visualisation dependencies
import matplotlib.pyplot as plt
import textwrap
# from pyvis.network import Network as net
from copy import deepcopy
import networkx as nx
from pyvis import network as net
from community import community_louvain

import warnings
warnings.filterwarnings('ignore')


import warnings
warnings.filterwarnings('ignore')

import uuid
import glob

from datetime import datetime
import chardet
import random
import re
from itertools import combinations

import subprocess

Load the TEI Texts

View it here

  • Specify URL of TEI
  • Load the TEI as Soup object
Code
# URL of the raw TEI XML file on GitHub that is downloaded and parsed below
url = 'https://raw.githubusercontent.com/RichardFreedman/DDL_2022_Pandas_Jupyter/refs/heads/main/Miller_DC_Texts_TEI_5_13.xml'
Code
# load the xml file
# a timeout keeps the notebook from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page as TEI
response.raise_for_status()
# NOTE(review): 'html.parser' is an HTML parser, not an XML one; it is kept
# because the lookups on `soup` in this file were written against its output
soup = BeautifulSoup(response.text, 'html.parser')

Create Dataframe based on each Line of Each Poem

Code
# one dictionary per line of verse; turned into a dataframe afterwards
list_dicts = []

# find each poem
poems = soup.find_all('div', type="poem")
for poem in poems:
    # some poems have only one stanza, so there is no <lg> tag; others have
    # two or more.  find_all() returns an empty list when there is no <lg>,
    # in which case the poem itself is treated as its single stanza (the
    # earlier `find('lg') == False` test could never be true because find()
    # returns None, so such poems produced no rows at all).
    stanzas = poem.find_all('lg') or [poem]

    # line numbers run continuously through the poem, across stanzas
    line_number = 0
    for stanza_number, stanza in enumerate(stanzas, start=1):
        for line in stanza.find_all('l'):
            line_number += 1
            # build a fresh dict for every line so rows never share state
            list_dicts.append({
                'dc_id': poem['xml:id'],
                'rhyme_scheme': poem['rhyme'],
                'meter': poem['met'],
                'stanza_id': stanza_number,
                'line_id': line_number,
                'verse': line.get_text().strip(),
            })
Code
# one dataframe row per dictionary in list_dicts, i.e. per line of verse
df = pd.DataFrame(data=list_dicts)
df
dc_id rhyme_scheme meter stanza_id line_id verse
0 DC0101 ababbcbc 10 1 1 Qui souhaitez avoir tout le plaisir
1 DC0101 ababbcbc 10 1 2 Qu'un amy peult vouloir honnestement
2 DC0101 ababbcbc 10 1 3 Prenez exemple à mon chaste desir
3 DC0101 ababbcbc 10 1 4 Et vous mirez en mon contentement.
4 DC0101 ababbcbc 10 2 5 Mais qui vouldroit audacieusement
... ... ... ... ... ... ...
2723 DC1616 ababbcbc 8 2 8 A tout jamais le mien te donne.
2724 DC1618 abab 8 1 1 C’est bien disner quant on eschappe,
2725 DC1618 abab 8 1 2 Sans des-bourcer pas-un denier,
2726 DC1618 abab 8 1 3 Puis torcher son nez à la nappe,
2727 DC1618 abab 8 1 4 Et dirɇ adieu au tavernier.

2728 rows × 6 columns

Add Rhyme for Individual Lines

Code
# adding rhyme values for each line of verse
grouped_df = df.groupby('dc_id')

# Get the first rhyme scheme for each group
first_rhyme_schemes = grouped_df['rhyme_scheme'].first()

# every poem needs exactly one rhyme letter per line; report the offending
# poems by id rather than letting a mismatch shift the letters of later poems
line_counts = grouped_df.size()
mismatched = line_counts.index[line_counts != first_rhyme_schemes.str.len()]
if len(mismatched) > 0:
    raise ValueError(
        "rhyme scheme length differs from the number of lines for: "
        f"{list(mismatched)}"
    )

# pick each line's letter by its position inside its own poem, so the result
# does not depend on the rows being sorted by dc_id (groupby sorts its keys)
line_positions = grouped_df.cumcount()
combined_list_rhymes = [
    scheme[position]
    for scheme, position in zip(df['dc_id'].map(first_rhyme_schemes), line_positions)
]

# and add back to the original df
df['rhyme'] = combined_list_rhymes
df
dc_id rhyme_scheme meter stanza_id line_id verse rhyme rhyme_word
0 DC0101 ababbcbc 10 1 1 Qui souhaitez avoir tout le plaisir a plaisir
1 DC0101 ababbcbc 10 1 2 Qu'un amy peult vouloir honnestement b honnestement
2 DC0101 ababbcbc 10 1 3 Prenez exemple à mon chaste desir a desir
3 DC0101 ababbcbc 10 1 4 Et vous mirez en mon contentement. b contentement
4 DC0101 ababbcbc 10 2 5 Mais qui vouldroit audacieusement b audacieusement
... ... ... ... ... ... ... ... ...
2723 DC1616 ababbcbc 8 2 8 A tout jamais le mien te donne. c donne
2724 DC1618 abab 8 1 1 C’est bien disner quant on eschappe, a eschappe
2725 DC1618 abab 8 1 2 Sans des-bourcer pas-un denier, b denier
2726 DC1618 abab 8 1 3 Puis torcher son nez à la nappe, a nappe
2727 DC1618 abab 8 1 4 Et dirɇ adieu au tavernier. b tavernier

2728 rows × 8 columns

Inspect an individual poem

Code
# show every row (line of verse) belonging to one poem, chosen by its dc_id
df.loc[df['dc_id'] == 'DC1511']
dc_id rhyme_scheme meter stanza_id line_id verse rhyme rhyme_word
2569 DC1511 ababbccbcbd 10 1 1 Je le sçay bien que la mort fait cognoistre, a cognoistre
2570 DC1511 ababbccbcbd 10 1 2 Et ressentir du grand seigneur la grace, b grace
2571 DC1511 ababbccbcbd 10 1 3 La mort nous fait derechef vivrɇ et naistre, a naistre
2572 DC1511 ababbccbcbd 10 1 4 Malgré peché, qui tant de maux nous brasse: b brasse
2573 DC1511 ababbccbcbd 10 1 5 Mais non pourtant pourray-je me démettre b démettre
2574 DC1511 ababbccbcbd 10 2 6 Du grief travail, qui mon cueur outrepasse, c outrepasse
2575 DC1511 ababbccbcbd 10 2 7 Je le sçay bien quelque conseil,quelque bien q... c face
2576 DC1511 ababbccbcbd 10 2 8 Je ne sçaurois en confort me remettre, b remettre
2577 DC1511 ababbccbcbd 10 2 9 Dont prieray Dieu, que ce grand dueil efface, c efface
2578 DC1511 ababbccbcbd 10 2 10 De reconfort le seul autheur, et maistre. b maistre
2579 DC1511 ababbccbcbd 10 3 11 Je le sçay bien d bien

Distribution of Rhyme Schemes

Code
# tally how many rows of df carry each rhyme scheme
rhyme_scheme_counts = df.rhyme_scheme.value_counts()

# the twenty largest tallies, biggest first
rhyme_scheme_counts.sort_values(ascending=False).iloc[:20]
rhyme_scheme
ababbcbc                  1224
ababbccdcd                 670
ababcdcd                    64
abab                        60
abba                        52
ababbcc                     35
abaabbccdcd                 22
aabbccddeeffgghheejjkk      22
ababbccdccd                 22
ababbccdcdebbeeffghhg       21
ababbcccdc                  20
aabba                       20
ababbccbcb                  20
abcccdcabcccdcab            16
aabab                       15
abcdcdabededab              14
ababbcbcdecede              14
aabb                        12
aabcdcaabcdc                12
abcbdedefgfg                12
Name: count, dtype: int64

A Bar Chart of Rhyme Schemes

Code
# bar chart with one bar per rhyme scheme
fig = px.bar(
    x=rhyme_scheme_counts.index,
    y=rhyme_scheme_counts,
    title="Rhyme Scheme Distribution in Du Chemin Chansons nouvelles",
)

# option to rename the axes
# rhyme_scheme_counts is tallied from df, which holds one row per line of
# verse, so each bar is a number of lines (not a number of chansons)
fig.update_yaxes(title_text='Number of Lines')
fig.update_xaxes(title_text='Rhyme Scheme')

# Set width and height in pixels
fig.update_layout(width=800, height=600)
fig.show()

Finding Rhyme Pairs

  • We first strip any trailing punctuation (non-word characters) from the end of each verse
  • Then split all the verses into a list of strings (of individual words)
Code
# function to strip trailing punctuation (non-word characters) from a verse
def remove_non_alpha_chars(text):
    """Return *text* without any run of non-word characters at its end.

    Only the end of the string is cleaned: characters inside the text
    (apostrophes, hyphens, an opening parenthesis, ...) are left untouched.

    Args:
        text: the string to clean, e.g. one line of verse.

    Returns:
        The string with trailing punctuation and whitespace removed.
    """
    # raw string so that \W reaches the regex engine as written instead of
    # being treated as an (invalid) string escape sequence
    return re.sub(r"\W*$", "", text)
def remove_parentheses(text):
    """Return *text* with every "(" and ")" character deleted."""
    parenthesis = re.compile(r'[()]')
    return parenthesis.sub('', text)
Code
# every verse, cleaned by remove_non_alpha_chars, split on whitespace into words
df["verse"].map(remove_non_alpha_chars).str.split()
0       [Qui, souhaitez, avoir, tout, le, plaisir]
1       [Qu'un, amy, peult, vouloir, honnestement]
2         [Prenez, exemple, à, mon, chaste, desir]
3         [Et, vous, mirez, en, mon, contentement]
4           [Mais, qui, vouldroit, audacieusement]
                           ...                    
2723        [A, tout, jamais, le, mien, te, donne]
2724    [C’est, bien, disner, quant, on, eschappe]
2725           [Sans, des-bourcer, pas-un, denier]
2726       [Puis, torcher, son, nez, à, la, nappe]
2727              [Et, dirɇ, adieu, au, tavernier]
Name: verse, Length: 2728, dtype: object
Code
# the rhyme word is the last word of each cleaned line ...
last_words = (
    df["verse"]
    .apply(remove_non_alpha_chars)
    .str.split()
    .str[-1]
)
df["rhyme_word"] = last_words
# ... with any parentheses taken out
df["rhyme_word"] = df["rhyme_word"].apply(remove_parentheses)

How many unique rhyme words?

Code
# number of distinct values in the rhyme_word column (a missing value counts as one)
df["rhyme_word"].nunique(dropna=False)
1595
Code

# for every poem and rhyme letter, gather the rhyme words into one list
rhymes_df = (
    df.groupby(["dc_id", "rhyme"])["rhyme_word"]
    .apply(list)
    .reset_index()
)
rhymes_df
dc_id rhyme rhyme_word
0 DC0101 a [plaisir, desir]
1 DC0101 b [honnestement, contentement, audacieusement, h...
2 DC0101 c [tient, appartient]
3 DC0102 a [plaist, desplait]
4 DC0102 b [marchander, demander]
... ... ... ...
1072 DC1616 a [heureux, yeux]
1073 DC1616 b [l’heure, demeure, t’asseure, demeure]
1074 DC1616 c [bonne, donne]
1075 DC1618 a [eschappe, nappe]
1076 DC1618 b [denier, tavernier]

1077 rows × 3 columns

Find all the Songs in which Each Rhyme Appears

Code
# for every rhyme word, gather the dc_id of each line that ends with it
words_df = (
    df.groupby(["rhyme_word"])["dc_id"]
    .apply(list)
    .reset_index()
)
words_df
rhyme_word dc_id
0 Aenée [DC1408]
1 Anglois [DC1410]
2 Angloise [DC1410]
3 Arrondelle [DC1113, DC1114]
4 Cieux [DC1202]
... ... ...
1590 vuide [DC1508]
1591 yeulx [DC0424, DC0508, DC0605, DC0706, DC0708, DC071...
1592 yeux [DC0302, DC1108, DC1202, DC1314, DC1517, DC1616]
1593 zelle [DC0226]
1594 œuvre [DC0414]

1595 rows × 2 columns

Code
# for instance, the list of pieces stored for the exact spelling 'Arrondelle'
is_arrondelle = words_df['rhyme_word'] == 'Arrondelle'
words_df.loc[is_arrondelle, 'dc_id'].astype(str)
3    ['DC1113', 'DC1114']
Name: dc_id, dtype: object

Finding all the Combinations of Rhymes in Each Piece

  • Here we find all two-word combinations of all the words sharing a rhyme sound within each piece. The pairs are thus ‘internal’ to the individual songs
Code
# `rhymes` was never defined: start from a copy of rhymes_df (one row per
# poem and rhyme letter) so that rhymes_df itself is left unchanged
rhymes = rhymes_df.copy()

# every two-word combination of the words that share a rhyme within a poem;
# a rhyme with a single word yields an empty list
rhymes['pairs'] = rhymes['rhyme_word'].apply(lambda words: list(combinations(words, 2)))
rhymes
dc_id rhyme rhyme_word pairs
0 DC0101 a [plaisir, desir] [(plaisir, desir)]
1 DC0101 b [honnestement, contentement, audacieusement, h... [(honnestement, contentement), (honnestement, ...
2 DC0101 c [tient, appartient] [(tient, appartient)]
3 DC0102 a [plaist, desplait] [(plaist, desplait)]
4 DC0102 b [marchander, demander] [(marchander, demander)]
... ... ... ... ...
1072 DC1616 a [heureux, yeux] [(heureux, yeux)]
1073 DC1616 b [l’heure, demeure, t’asseure, demeure] [(l’heure, demeure), (l’heure, t’asseure), (l’...
1074 DC1616 c [bonne, donne] [(bonne, donne)]
1075 DC1618 a [eschappe, nappe] [(eschappe, nappe)]
1076 DC1618 b [denier, tavernier] [(denier, tavernier)]

1077 rows × 4 columns

Code
# explode the lists of pairs into one row per pair (a long dataset),
# then discard every row that contains a missing value
rhymes_exploded = rhymes.explode(column='pairs').dropna()
# rhymes_exploded.pairs.to_list()
Code
# how many times each pair occurs in the exploded table
# (pairs are tuples, so (a, b) and (b, a) are tallied as different pairs)
pair_counts = rhymes_exploded.pairs.value_counts()
pair_counts
pairs
(dire, martyre)         8
(desire, dire)          6
(grace, passe)          5
(martyre, desire)       5
(envie, vie)            5
                       ..
(picque, impudique)     1
(pudique, impudique)    1
(pudique, picque)       1
(vains, conjoinctz)     1
(denier, tavernier)     1
Name: count, Length: 2205, dtype: int64
Code
# function used in the graph to get the DC IDs for each word

def get_dc_ids(rhyme_word, words_df):
    """Return the ids of the pieces in which *rhyme_word* is a rhyme word.

    Matching ignores case, so 'Arrondelle' and 'arrondelle' are looked up
    together.

    Args:
        rhyme_word: the word to look up.
        words_df: dataframe with a 'rhyme_word' column and a 'dc_id' column.
            Each 'dc_id' cell may be a single id or a list of ids.

    Returns:
        A sorted list of the distinct ids found; empty if the word is absent.
    """
    target = rhyme_word.lower()

    # use the dataframe that was passed in (not a module-level global)
    is_match = words_df['rhyme_word'].str.lower().isin([target])

    piece_ids = set()
    for value in words_df.loc[is_match, 'dc_id']:
        if isinstance(value, (list, tuple, set)):
            # grouped input: the cell holds several ids
            piece_ids.update(value)
        else:
            # line-level input: the cell holds one id
            piece_ids.add(value)

    # sorted so that the result (shown as hover text) is reproducible
    return sorted(piece_ids)

# Example usage: look up the pieces for one rhyme word
rhyme_word = 'Arrondelle'
get_dc_ids(rhyme_word=rhyme_word, words_df=words_df)
['DC0714', 'DC1113', 'DC1114', 'DC0805']

Graph 1: Without Size or Hover Data

Code
# set graph options:
graph_height = 800
graph_width = 800
detect_louvain_communities = True
add_forceAtlas2Based_physics = True

# Create an empty NetworkX graph
G = nx.Graph()

# Add one node per rhyme word and one edge per rhyme pair
for pairs in rhymes_exploded['pairs']:
    # skip anything that is not a tuple; without this guard the loop would
    # carry on with node1/node2 left over from the previous row
    if not isinstance(pairs, tuple):
        continue
    node1, node2 = pairs

    # Adding nodes if they don't exist already; the hover title lists the
    # pieces that use the word and the node size is the number of such pieces
    for node in (node1, node2):
        if node not in G.nodes:
            piece_ids = get_dc_ids(node, words_df)
            G.add_node(node, title=f"{piece_ids}", size=len(piece_ids))

    # Adding the edge (no weight attribute is set)
    G.add_edge(node1, node2)

def add_communities(G):
    """Return a copy of G whose nodes carry their Louvain partition as "group".

    The graph passed in is not modified: the partition computed by
    community_louvain.best_partition is written onto a deep copy.
    """
    G = deepcopy(G)
    partition = community_louvain.best_partition(G)
    nx.set_node_attributes(G, partition, "group")
    return G


# the helper is always defined; the option only decides whether it is applied
if detect_louvain_communities:
    G = add_communities(G)

# set display parameters
# width now uses graph_width (it was being set from graph_height)
network_graph = net.Network(
    notebook=True,
    width=graph_width,
    height=graph_height,
    bgcolor="black",
    font_color="white",
)

# Set the physics layout of the network

if add_forceAtlas2Based_physics:

    network_graph.set_options("""
    {
    "physics": {
    "enabled": true,
    "forceAtlas2Based": {
        "springLength": 1
    },
    "solver": "forceAtlas2Based"
    }
    }
    """)

# copy the nodes and edges of the NetworkX graph into the pyvis network
network_graph.from_nx(G)
# write the interactive graph to an HTML file and display it
network_graph.show("Du Chemin Simple.html")